# topic 11 f # Load the functions we will need in this script source( "../gnrnd5.R") source( "../assess_normality.R") source( "../pop_sd.R") # # Here we are looking to answer questions about # whether or not a population is "normal", that is, # does the population distribution look like the # normal probability curve? # The first thing to recognize is that no finite population # will ever be exactly normal. The best we can do is to find # populations that are approximately normal. For example, # here is a population of 9500 values that is approximately # normal with mean about 45 and standard deviation about 11. gnrnd5(269834949904, 1100004500) # look at the first 50 values head( L1, 50) # look at the last 50 values tail( L1, 50 ) # find the mean and standard deviation mean( L1 ) pop_sd( L1 ) # look at a historgram of the population hist( L1 ) # look at a boxplot of the population boxplot( L1, horizontal = TRUE) # just to make our example a little easier, we can transform # our population into a new one that has mean=45 and sd=11 L2 <- (L1 - mean(L1))/pop_sd(L1) # L2 will have mean=0 # sd=1 mean( L2 ) pop_sd( L2 ) L3 <- L2*11 + 45 # L3 should have mean=45 and sd=11 mean( L3 ) pop_sd( L3 ) # What are the things that we know about a true normal # distribution? # First, it should look like the normal distribution. # Here is a plot of a N(45,11) distribution ###################################### ## making this plot is beyond to ## ## requirements of this course. ## ###################################### x <- seq(1, 89, length=400) hx <- dnorm(x, mean=45, sd=11) plot(x, hx, type="l", lty=1, xlim=c(0,90), xaxp=c(0,90,18), ylim=c(0,0.05), yaxp=c(0,0.05,10), las=2, cex.axis=0.75, xlab="z value", ylab="Density", main="N(45,11) Distribution") abline( v=0, lty=1, lwd=2) abline( h=0, lty=1, lwd=2) abline( h=seq(0.005,0.05,0.005), lty=3, col="darkgray") abline( v=seq(5,90,5), lty=3, col="darkgray") # # then add a red vertical line at the place where we # have both the mean and the median lines( c(45, 45),c(0, dnorm(45, mean=45, sd=11)), col="red", lw=2) # going down and up one standard deviation from the mean # would give us the x values of 45-11=34 and 45+11=56. # Put green vertical liens there. lines( c(34, 34),c(0, dnorm(34, mean=45, sd=11)), col="green", lw=2) lines( c(56, 56),c(0, dnorm(56, mean=45, sd=11)), col="green", lw=2) # going down and up two standard deviations from the mean # would give us the x values of 45-2*11=23 and 45+2*11=67. # Put blue vertical lines there. lines( c(23, 23),c(0, dnorm(23, mean=45, sd=11)), col="blue", lw=2) lines( c(67, 67),c(0, dnorm(67, mean=45, sd=11)), col="blue", lw=2) # from what we know of the normal distribution the area under the # curve to the left of the left blue line should be pnorm( 23, mean=45, sd=11) # this will also be the area to the right of the right blue line pnorm( 67, mean=45, sd=11, lower.tail=FALSE) # We want to compare our population, L3, to this kind of a # distribution. To do this, we will first get a sorted # listing of our population values pop_sorted <- sort( L3 ) # then we should have about 2.27% of those values that are # less than 23. There are 9500 values in our population # so we can find 2.27% of that number 0.0277*9500 # and how many do we have that are less than or equal to 23? less_than_23 <- pop_sorted[ pop_sorted<=23 ] length( less_than_23 ) # we have 210 values but we expected 263. Not the same # but not terribly off. # let us see how many are greater than or equal to 67 greater_than_67 <- pop_sorted[ pop_sorted>=67 ] length( greater_than_67 ) # we have 205 values but we expected 263. Not the same # but not terribly off. # We can do the same for the area to the left and right of # the green lines pnorm( 34, mean=45, sd=11) # pnorm( 56, mean=45, sd=11, lower.tail=FALSE) # # so we can compute how many we would expect out our 9500 0.1586553*9500 # and we can see how many of our values are in those ranges less_than_34 <- pop_sorted[ pop_sorted<=34 ] length( less_than_34 ) # that was ridiculously close to what we expect. greater_than_56 <- pop_sorted[ pop_sorted>=56 ] length( greater_than_56 ) # again, really close # then, just for the sake of looking at things, # what is the median of our population? We expect it # to be the same as the mean. median( L3 ) # that is really good. # to get a little bit more detail, let us look at # quantiles both in the ideal Normal case and in # our population our_points <- seq( 0.1, 0.9, 0.1) our_points # these two commands should give about the same results qnorm( our_points, mean=45, sd=11) quantile( pop_sorted, our_points ) # This looks really good. # do this again, but at more percentage points our_points <- seq( 0.05, 0.95, 0.05) our_points # these two commands should give about the same results qnorm( our_points, mean=45, sd=11) quantile( pop_sorted, our_points ) # This looks really good. # we could do the same thing again, this time at percentage # points 1%, 2%, 3%, ..., 99% but that is just a lot of points, # many more than we want to compare. But we could do that # as a graph. our_points <- seq( 0.01, 0.99, 0.01) our_points # these two commands should give about the same results x <- qnorm( our_points, mean=45, sd=11) y <- quantile( pop_sorted, our_points ) plot( x,y ) # If our population is a Normal distribution then the # plot should look like a straight line of points. # This is, essentially, what our assess_normality # function does. assess_normality( L3 ) # The difference is that our previous plot had 99 points whereas # the assess_normality plot has one point for each value in our # population, in this case 9500 points. # Of course, we could have looked at a box and whisker # plot to see if the data looked normal boxplot( L3, horizontal=TRUE) # For a Normal distribution the two boxes should be the # same size, and the length of whiskers should be about # 3 times the width of each of the two boxes, and, if there # are many items in the population we do expect to see # some outliers. # Alternatively, we could look at a histogram of the # population. hist( L3, main="L3 with the default breakpoint" ) # But remember that histograms can change their appearance # if we change the break points. hist( L3, breaks=seq(0,90,4.5), main="L3 with interval width=4.5") hist( L3, breaks=seq(0,90,3), main="L3 with interval width=3") hist( L3, breaks=seq(0,90,1.5), main="L3 with interval width=1.5") hist( L3, breaks=seq(0,90,1), main="L3 with interval width=1") ####################################### # Now look at a different population, this one is not # normal. gnrnd5( 294706944905, 800002500, 2300009000) hist( L1 ) boxplot( L1, horizontal=TRUE) assess_normality( L1 )